In [1]:
    
%matplotlib inline
import pandas as pd
from dateutil.relativedelta import relativedelta
import statsmodels.formula.api as sm
import requests
import pickle
from user_object import User
    
Our measures of user activity over a time span include:

- number of edits
- number of article (namespace 0) edits
- number of days active
- fraction of namespace-0 edits that were deleted, reverted, or productive

Our measures of harassment received/made over a time span are:

- number of comments received that a classifier scored above a threshold, for any of our 3 harassment classifiers (aggression, attack, toxicity) and a range of thresholds
- number of comments made that a classifier scored above a threshold, for any of our 3 harassment classifiers

We also gather:

- the user's reported gender
- the number of user warnings received
- the date of the user's first edit
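All of these are computed from pickled User objects (the user_object.User class imported above). The feature extraction code below relies only on the attributes sketched here; this interface is inferred from how the objects are used in this notebook, not taken from the actual class definition:

# Attributes of user_object.User assumed by the code below (an inference, not the real class):
#   user.user_id           -- user id
#   user.gender            -- 'male', 'female', or unspecified
#   user.first_edit_day    -- date of the user's first edit
#   user.df_activity       -- edit activity DataFrame (or None) with columns: timestamp, ns,
#                             n_revisions, n_deleted_revisions, n_identity_reverted_revisions,
#                             n_productive_revisions
#   user.df_comments_to    -- comments received (or None), with timestamp and
#                             pred_aggression_score / pred_attack_score / pred_toxicity_score
#   user.df_comments_from  -- comments made (or None), same columns as df_comments_to
#   user.df_uw             -- user warnings received (or None), with a timestamp column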
As mentioned above, we gather activity and harassment features for newcomers in time span t1 and see how they correlate with activity features in time span t2.
In the following analysis, the two time spans of interest are the first and second months after user registration.
In [2]:
    
def select_month_since_start(user, activity, t):
    """Restrict activity/comment rows to the t-th month after the user's first edit."""
    start = user.first_edit_day + relativedelta(months=(t - 1))
    stop = user.first_edit_day + relativedelta(months=t)
    activity = activity[activity['timestamp'] < stop]
    activity = activity[activity['timestamp'] >= start]
    return activity


def count_edits(user, t):
    """Number of revisions the user made in month t."""
    activity = user.df_activity
    if activity is None:
        return 0
    activity = select_month_since_start(user, activity, t)
    return activity['n_revisions'].sum()


def count_ns0_revisions(user, t):
    """Number of article (ns0) revisions the user made in month t."""
    activity = user.df_activity
    if activity is None:
        return 0
    activity = select_month_since_start(user, activity, t)
    activity = activity.query("ns=='0'")
    return activity['n_revisions'].sum()


def count_days_active(user, t):
    """Number of distinct days on which the user edited in month t."""
    activity = user.df_activity
    if activity is None:
        return 0
    activity = select_month_since_start(user, activity, t)
    return len(activity.timestamp.unique())


def count_score_received_above_threshold(user, score, threshold, t):
    """Number of comments received in month t whose classifier score exceeds threshold."""
    if user.df_comments_to is None:
        return 0
    comments = user.df_comments_to
    comments = select_month_since_start(user, comments, t)
    return (comments[score] > threshold).sum()


def count_score_made_above_threshold(user, score, threshold, t):
    """Number of comments made in month t whose classifier score exceeds threshold."""
    if user.df_comments_from is None:
        return 0
    comments = user.df_comments_from
    comments = select_month_since_start(user, comments, t)
    return (comments[score] > threshold).sum()


def is_female(u):
    return int(u.gender == 'female')


def is_male(u):
    return int(u.gender == 'male')


def count_warnings_received(user, t):
    """Number of user warnings received in month t."""
    warnings = user.df_uw
    if warnings is None:
        return 0
    warnings = select_month_since_start(user, warnings, t)
    return len(warnings)


def count_fraction_of_ns0_revisions_x(user, x, t):
    """Fraction of the user's ns0 revisions in month t counted by column x
    (e.g. n_deleted_revisions, n_identity_reverted_revisions, n_productive_revisions)."""
    if user.df_activity is None:
        return 0
    activity = user.df_activity.query("ns=='0'")
    activity = select_month_since_start(user, activity, t)
    if activity['n_revisions'].sum() < 1:
        return 0
    return float(activity[x].sum()) / activity['n_revisions'].sum()
    
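To make the windowing explicit: select_month_since_start keeps rows whose timestamp falls in the half-open interval [first_edit_day + (t-1) months, first_edit_day + t months). A quick check with a made-up first edit date (purely illustrative, not from the data):

from datetime import date
from dateutil.relativedelta import relativedelta

first_edit_day = date(2015, 3, 10)   # hypothetical newcomer
for t in (1, 2):
    start = first_edit_day + relativedelta(months=t - 1)
    stop = first_edit_day + relativedelta(months=t)
    print(t, start, stop)
# 1 2015-03-10 2015-04-10
# 2 2015-04-10 2015-05-10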
In [3]:
    
feature_map = {
    'first_edit_day' : lambda u: u.first_edit_day,
    'm1_num_ns0_edits' : lambda u: count_ns0_revisions(u, 1),
    'user_id' : lambda u : u.user_id,
    'is_female' : is_female,
    'is_male' : is_male,
    'has_gender' : lambda u: int(is_female(u) or is_male(u)),
    'm1_num_edits' : lambda u: count_edits(u, 1) ,
    'm2_num_edits' : lambda u: count_edits(u, 2),
    'm1_num_days_active' : lambda u: count_days_active(u, 1),
    'm2_num_days_active' : lambda u: count_days_active(u, 2),
    'm1_num_warnings_recieved' : lambda u: count_warnings_received(u, 1),
    'm1_fraction_ns0_deleted' : lambda u: count_fraction_of_ns0_revisions_x(u, 'n_deleted_revisions', 1) ,
    'm1_fraction_ns0_reverted' : lambda u: count_fraction_of_ns0_revisions_x(u, 'n_identity_reverted_revisions', 1) ,
    'm1_fraction_ns0_productive' : lambda u:  count_fraction_of_ns0_revisions_x(u, 'n_productive_revisions', 1) ,
    'm1_active' : lambda u: int(count_edits(u, 1) > 0),
    'm2_active' : lambda u: int(count_edits(u, 2) > 0),
}
        
        
# Add received/made comment counts for each classifier and score threshold.
# (Default arguments bind the current clf/threshold values, so each lambda
# keeps its own copies rather than the loop's final values.)
for clf in ['aggression', 'attack', 'toxicity']:
    for threshold in [0.01, 0.425, 0.75, 0.85]:
        feature_map['m1_num_%s_received_%.3f' % (clf, threshold)] = \
            lambda u, clf=clf, threshold=threshold: count_score_received_above_threshold(
                u, 'pred_%s_score' % clf, threshold, 1)
        feature_map['m1_num_%s_made_%.3f' % (clf, threshold)] = \
            lambda u, clf=clf, threshold=threshold: count_score_made_above_threshold(
                u, 'pred_%s_score' % clf, threshold, 1)
    
In [4]:
    
random_user_objects = pickle.load(open("../../data/retention/random_user_data.pkl", "rb"))
    
In [5]:
    
d = {k : [v(u) for u in random_user_objects] for k,v in feature_map.items()}
df_features = pd.DataFrame(d)
df_features.index = df_features.user_id
del df_features['user_id']
print(df_features.shape)
df_active = df_features.query('m1_active == 1')
print(df_active.shape[0])
df_active.to_csv("../../data/retention/random_user_sample_features.csv")
    
    
In [6]:
    
attacked_user_objects = pickle.load(open("../../data/retention/attacked_user_data.pkl", "rb"))
    
In [7]:
    
d = {k : [v(u) for u in attacked_user_objects] for k,v in feature_map.items()}
df_features = pd.DataFrame(d)
df_features.index = df_features.user_id
del df_features['user_id']
print(df_features.shape)
df_active = df_features.query('m1_active == 1')
print(df_active.shape[0])
df_active.to_csv("../../data/retention/attacked_user_sample_features.csv")